In [1]:
import json
import datetime
import numpy as np

# Directory holding the numbered JSON-lines dumps (file0.txt ... file792.txt).
folder = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/outfiles'

# Length histogram in cumulative form: after the loop, counts[i] is the number
# of code blocks whose 'code' text is longer than i characters. Blocks longer
# than the array simply increment every slot.
counts = np.zeros(10000)
count = 0  # not updated by this loop; kept so the name stays defined
for file_num in range(793):
    # Files 660-669 are left out of this pass.
    if 659 < file_num < 670:
        continue
    filename = folder + '/file' + str(file_num) + '.txt'
    # Context manager so each of the several hundred handles is closed
    # promptly instead of lingering until garbage collection.
    with open(filename) as infile:
        for line in infile:
            # One JSON object per line; each carries a list under 'CodeBlocks'.
            line_obj = json.loads(line)
            for code_block in line_obj['CodeBlocks']:
                counts[:len(code_block['code'])] += 1

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(counts)


Out[2]:
[<matplotlib.lines.Line2D at 0x7fc4c40ea4a8>]

In [3]:
# Gather the text of every code block longer than 10000 characters.
# NOTE(review): unlike the length-histogram pass, this loop does not skip
# files 660-669 -- confirm that the difference is intended.
code_longer_than_threshold = []
for file_num in range(793):
    filename = folder + '/file' + str(file_num) + '.txt'
    # Context manager so every input file is closed as soon as it is consumed.
    with open(filename) as infile:
        for line in infile:
            line_obj = json.loads(line)
            code_longer_than_threshold.extend(
                code_block['code']
                for code_block in line_obj['CodeBlocks']
                if len(code_block['code']) > 10000
            )

In [5]:
# How many code blocks were collected, i.e. have 'code' text longer than 10000 characters.
len(code_longer_than_threshold)


Out[5]:
3826

In [26]:
# Limit this run to the first 2000 of the long code blocks.
# NOTE(review): the run label '10000_run2000' used for the experiment directory
# presumably encodes this count -- confirm and change both together.
experiment_code = code_longer_than_threshold[:2000]

In [27]:
import os
# Label for this run; it names the experiment directory under plagiarism/.
threshold = '10000_run2000'
experiment_dir = 'plagiarism/plag' + str(threshold)
# makedirs(..., exist_ok=True) creates the missing parents (including
# 'plagiarism' itself) and makes the cell safe to re-run, where plain
# os.mkdir would raise on a missing parent or an existing directory.
for subdir in ('inputs', 'outputs'):
    os.makedirs(os.path.join(experiment_dir, subdir), exist_ok=True)

In [28]:
import os
# Write each selected code block to its own numbered file
# (file0.py, file1.py, ...) in the experiment's inputs directory.
for i, code in enumerate(experiment_code):
    base_name = 'plagiarism/plag' + str(threshold) + '/inputs/file' + str(i) + '.py'
    # Context manager guarantees the handle is closed even if the write fails.
    with open(base_name, 'w') as f:
        f.write(code)

In [30]:
import os
# Build run.sh for this experiment: it runs JPlag over the inputs directory,
# redirects JPlag's stdout to experiment_cleaned.out and echoes the elapsed
# time as minutes:seconds.
# NOTE(review): every path here is relative, so the script is presumably meant
# to be launched from inside the experiment directory -- confirm.
jar_file = '../jplag-2.11.9-SNAPSHOT-jar-with-dependencies.jar'
lang = 'python3'
# Was misspelled 'ouputs', which did not match the 'outputs' directory that is
# created for the experiment.
results = 'outputs'
inputs = 'inputs'
command = "java -jar " + jar_file + " -l " + lang + " -r " + results + " -s " + inputs + " -m 200"
bash_file_text = '''start=$(date +%s)
touch experiment_cleaned.out
''' + command + " > experiment_cleaned.out" + '''
end=$(date +%s)
runtime=$(python -c "print ('%u:%02u' % ((${end} - ${start})/60, (${end} - ${start})%60))")
echo $runtime
'''
# Context manager so the script is flushed and closed before anything runs it.
with open('plagiarism/plag' + str(threshold) + '/run.sh', 'w') as bash_file:
    bash_file.write(bash_file_text)


Out[30]:
293

Results


In [33]:
# Run times measured at a code-length threshold of 10000, recorded as
# (number of files, seconds to completion); the original m:ss reading is
# noted beside each pair.
timings = [
    (100, 104),     # 1:44
    (200, 212),     # 3:32
    (500, 376),     # 6:16
    (1000, 573),    # 9:33
    (2000, 1069),   # 17:49
    (3850, 2814),   # 46:54, run with ~3800 files
]
results = [n_files for n_files, _ in timings]
results_y = [seconds for _, seconds in timings]
plt.scatter(results, results_y)
plt.xlabel('Number of Files')
plt.ylabel('Number of seconds to completion')


Out[33]:
Text(0,0.5,'Number of seconds to completion')

In [ ]:


In [ ]: